Use Global Infection Data to Analysis¶

In [1]:
import pandas as pd

# Read the Excel file
try:
    df = pd.read_excel('world_covid.xlsx')
    print("Excel file read successfully")
except FileNotFoundError:
    print("File not found, please check the file path")
    raise

# Convert the 'date' column to datetime type
try:
    df['date'] = pd.to_datetime(df['date'])
    print("Date column converted successfully")
except KeyError:
    print("Date column 'date' not found, please check the column name")
    raise

# Group by month and calculate the total cases for each month
try:
    df['month'] = df['date'].dt.to_period('M')
    monthly_cases = df.groupby('month')['new_cases'].sum()
    print("Monthly cases calculated successfully")
except KeyError:
    print("Cases column 'new_cases' not found, please check the column name")
    raise

# Save the result in a new DataFrame
result_df_world = pd.DataFrame({
    'month': monthly_cases.index.astype(str),
    'new_cases': monthly_cases.values
})

# Display the result
print(result_df_world)
Excel file read successfully
Date column converted successfully
Monthly cases calculated successfully
      month  new_cases
0   2020-01       2033
1   2020-02      76214
2   2020-03     611707
3   2020-04    2037922
4   2020-05    3163451
5   2020-06    3936842
6   2020-07    6074723
7   2020-08    9261926
8   2020-09    8143085
9   2020-10   10187462
10  2020-11   19566436
11  2020-12   17255870
12  2021-01   22070633
13  2021-02   11022033
14  2021-03   12861645
15  2021-04   19601773
16  2021-05   23734840
17  2021-06   10822935
18  2021-07   13410750
19  2021-08   22253836
20  2021-09   15484337
21  2021-10   15086099
22  2021-11   14569045
23  2021-12   19380554
24  2022-01   94694159
25  2022-02   59761654
26  2022-03   46082485
27  2022-04   26970423
28  2022-05   18751417
29  2022-06   15145267
30  2022-07   32981058
31  2022-08   22850430
32  2022-09   14173487
33  2022-10   15057500
34  2022-11   10483582
35  2022-12   67065915
36  2023-01   47674052
37  2023-02    4851818
38  2023-03    3635718
39  2023-04    3617660
40  2023-05    1895281
41  2023-06     918530
42  2023-07    1127580
43  2023-08    1536305
44  2023-09     830747
45  2023-10     758741
46  2023-11     776969
47  2023-12    1614765
48  2024-01     640260
49  2024-02     344408
50  2024-03     355645
51  2024-04     147440
52  2024-05     136159
53  2024-06     190568
54  2024-07     201714
55  2024-08      47169
In [2]:
import pandas as pd
from dateutil import parser

# Read the existing result_df DataFrame

# Convert the 'month' column in result_df to Period type
result_df_world['month'] = pd.to_datetime(result_df_world['month']).dt.to_period('M')

# Read the Monthly_figures_on_aviation CSV file
aviation_df = pd.read_csv('Monthly_figures_on_aviation.csv')

# Assume aviation_df has a date column 'Periods' and other data columns
# Use dateutil.parser to parse the date column
aviation_df['date'] = aviation_df['Periods'].apply(lambda x: parser.parse(x, fuzzy=True))

# Group by month
aviation_df['month'] = aviation_df['date'].dt.to_period('M')

# Drop the original date column
aviation_df = aviation_df.drop(columns=['Periods'])

# Merge the two DataFrames by month, filling unmatched months with 0 for new_cases
merged_df = pd.merge(result_df_world, aviation_df, on='month', how='outer').fillna({'new_cases': 0})

# Display the result
print(merged_df)
      month  new_cases              Airports  Cross-country flights (number)  \
0   2019-01        0.0  Total Dutch airports                           43492   
1   2019-02        0.0  Total Dutch airports                           41972   
2   2019-03        0.0  Total Dutch airports                           47712   
3   2019-04        0.0  Total Dutch airports                           51400   
4   2019-05        0.0  Total Dutch airports                           55561   
..      ...        ...                   ...                             ...   
63  2024-04   147440.0  Total Dutch airports                           48019   
64  2024-05   136159.0  Total Dutch airports                           52221   
65  2024-06   190568.0  Total Dutch airports                           51320   
66  2024-07   201714.0  Total Dutch airports                           53018   
67  2024-08    47169.0  Total Dutch airports                           53309   

    Local flights (number)  \
0                     3547   
1                     3564   
2                     3992   
3                     4990   
4                     4923   
..                     ...   
63                    7738   
64                    8225   
65                    9109   
66                    9289   
67                    8486   

    Commercial air traffic/Flights/All flights/Total flights (number)  \
0                                               41285                   
1                                               39204                   
2                                               44649                   
3                                               47934                   
4                                               51719                   
..                                                ...                   
63                                              44627                   
64                                              48215                   
65                                              46924                   
66                                              48827                   
67                                              49249                   

    Commercial air traffic/Flights/All flights/Scheduled (number)  \
0                                               40217               
1                                               38408               
2                                               43828               
3                                               46845               
4                                               50253               
..                                                ...               
63                                              43671               
64                                              47026               
65                                              45685               
66                                              47230               
67                                              47775               

    Total passengers (number)  Total cargo (ton)  Total mail (ton)       date  
0                     5495052             133445              1615 2019-01-09  
1                     5352301             128322              1398 2019-02-09  
2                     6270788             155427              1761 2019-03-09  
3                     6926059             138159              1747 2019-04-09  
4                     7396960             144540              1860 2019-05-09  
..                        ...                ...               ...        ...  
63                    6274032             120928               498 2024-04-09  
64                    6930388             126163               523 2024-05-09  
65                    6884361             125021               511 2024-06-09  
66                    7251748             129985               500 2024-07-09  
67                    7375926             128625               490 2024-08-09  

[68 rows x 11 columns]

Analysis and Visualization of Question 1 and 2¶

In this section, we analyze the global COVID-19 infection data and its impact on aviation activities. The analysis includes the following steps:

  1. Data Reading and Preprocessing:

    • Read the global COVID-19 infection data from an Excel file.
    • Convert the 'date' column to datetime format.
    • Group the data by month and calculate the total number of new cases for each month.
  2. Merging with Aviation Data:

    • Read the monthly aviation data from a CSV file.
    • Parse the date column and group the data by month.
    • Merge the COVID-19 data with the aviation data on the 'month' column.
  3. Visualization:

    • Plot the number of new COVID-19 cases and various aviation-related variables over time.
    • Use line charts to visualize the trends and relationships between the variables.

The following plots are generated to visualize the data:

  1. Aircraft Movements and COVID-19 New Cases Over Time:

    • Cross-country flights and local flights are plotted on the primary y-axis.
    • New COVID-19 cases are plotted on the secondary y-axis.
  2. Commercial Air Traffic and COVID-19 New Cases Over Time:

    • Total passengers, total cargo, and total mail are plotted on the primary y-axis.
    • New COVID-19 cases are plotted on the secondary y-axis.

These visualizations help us understand the impact of the COVID-19 pandemic on aviation activities and identify any potential correlations between the variables.

In [3]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Convert 'month' column to string format for plotting
merged_df['month'] = merged_df['month'].astype(str)

# Set the figure size
fig, ax1 = plt.subplots(figsize=(12, 6))

# Plot the first three columns as line charts on the first y-axis
ax1.plot(merged_df['month'], merged_df['Cross-country flights (number)'], label='Cross-country flights', color='blue')
ax1.plot(merged_df['month'], merged_df['Local flights (number)'], label='Local flights', color='green')

# Set the labels and title for the first y-axis
ax1.set_xlabel('Date')
ax1.set_ylabel('Number of Flights')
ax1.set_title('Aircraft Movements and COVID-19 New Cases Over Time')
ax1.legend(loc='upper left')

# Rotate x-axis labels for better readability
ax1.set_xticks(ax1.get_xticks()[::2])  # Show every second label
ax1.set_xticklabels(merged_df['month'][::2], rotation=45, ha='right')

# Create a second y-axis for the new cases
ax2 = ax1.twinx()
ax2.plot(merged_df['month'], merged_df['new_cases'], label='New Cases', color='red')

# Set the label for the second y-axis with unit
ax2.set_ylabel('Number of New Cases')
ax2.legend(loc='upper right')

# Format the second y-axis to remove scientific notation
ax2.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f'{int(x):,}'))

# Adjust layout to prevent clipping of tick-labels
fig.tight_layout()

# Show the plot
plt.show()
No description has been provided for this image
In [4]:
import matplotlib.pyplot as plt

# Convert 'month' column to string format for plotting
merged_df['month'] = merged_df['month'].astype(str)

# Set the figure size for the first plot
fig, ax1 = plt.subplots(figsize=(12, 6))

# Plot the first of the last three columns as a line chart on the first y-axis
ax1.plot(merged_df['month'], merged_df.iloc[:, -4], label=merged_df.columns[-4], color='blue')

# Set the labels and title for the first y-axis
ax1.set_xlabel('Date')
ax1.set_ylabel('Number of Passengers')
ax1.set_title('Aircraft Movements and COVID-19 New Cases Over Time')
ax1.legend(loc='upper left')

# Rotate x-axis labels for better readability
ax1.set_xticks(ax1.get_xticks()[::2])  # Show every second label
ax1.set_xticklabels(merged_df['month'][::2], rotation=45, ha='right')

# Create a second y-axis for the new cases
ax2 = ax1.twinx()
ax2.plot(merged_df['month'], merged_df['new_cases'], label='New Cases', color='red')

# Set the label for the second y-axis with unit
ax2.set_ylabel('Number of New Cases')
ax2.legend(loc='upper right')

# Adjust layout to prevent clipping of tick-labels
fig.tight_layout()

# Show the first plot
plt.show()

# Set the figure size for the second plot
fig, ax1 = plt.subplots(figsize=(12, 6))

# Plot the second of the last three columns as a line chart on the first y-axis
ax1.plot(merged_df['month'], merged_df.iloc[:, -3], label=merged_df.columns[-3], color='green')

# Set the labels and title for the first y-axis
ax1.set_xlabel('Date')
ax1.set_ylabel('Number of Cargo')
ax1.set_title('Aircraft Movements and COVID-19 New Cases Over Time')
ax1.legend(loc='upper left')

# Rotate x-axis labels for better readability
ax1.set_xticks(ax1.get_xticks()[::2])  # Show every second label
ax1.set_xticklabels(merged_df['month'][::2], rotation=45, ha='right')

# Create a second y-axis for the new cases
ax2 = ax1.twinx()
ax2.plot(merged_df['month'], merged_df['new_cases'], label='New Cases', color='red')

# Set the label for the second y-axis with unit
ax2.set_ylabel('Number of New Cases')
ax2.legend(loc='upper right')

# Adjust layout to prevent clipping of tick-labels
fig.tight_layout()

# Show the second plot
plt.show()

# Set the figure size for the third plot
fig, ax1 = plt.subplots(figsize=(12, 6))

# Plot the third of the last three columns as a line chart on the first y-axis
ax1.plot(merged_df['month'], merged_df.iloc[:, -2], label=merged_df.columns[-2], color='purple')

# Set the labels and title for the first y-axis
ax1.set_xlabel('Date')
ax1.set_ylabel('Number of Mails')
ax1.set_title('Aircraft Movements and COVID-19 New Cases Over Time')
ax1.legend(loc='upper left')

# Rotate x-axis labels for better readability
ax1.set_xticks(ax1.get_xticks()[::2])  # Show every second label
ax1.set_xticklabels(merged_df['month'][::2], rotation=45, ha='right')

# Create a second y-axis for the new cases
ax2 = ax1.twinx()
ax2.plot(merged_df['month'], merged_df['new_cases'], label='New Cases', color='red')

# Set the label for the second y-axis with unit
ax2.set_ylabel('Number of New Cases')
ax2.legend(loc='upper right')

# Adjust layout to prevent clipping of tick-labels
fig.tight_layout()

# Show the third plot
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Adding Vaccination and Death Data for Correlation Analysis¶

In this section, we will incorporate vaccination and COVID-19 death data to explore potential correlations with aviation-related variables. The steps include:

  1. Data Integration: Merge the monthly vaccination and death data with the existing dataset.
  2. Correlation Calculation: Calculate the correlation coefficients between the number of new COVID-19 cases, vaccination data, death data, and aviation-related variables.
  3. Analysis: Analyze the correlation coefficients to understand the relationships between these variables.

The variables analyzed include:

  • Aviation-related Variables:

    • Local flights (number)
    • Cross-country flights (number)
    • Total passengers (number)
    • Total cargo (ton)
    • Total mail (ton)
  • COVID-19 Related Variables:

    • New COVID-19 cases
    • Monthly deaths
    • Monthly vaccinations

By examining these correlations, we aim to uncover insights into how the pandemic and vaccination efforts have impacted aviation activities.

In [5]:
import pandas as pd

# Read the new_deaths.csv file
new_deaths_df = pd.read_csv('new_deaths.csv')

# Assume new_deaths_df has a 'date' column and a 'World' column
# Convert the 'date' column to datetime type
new_deaths_df['date'] = pd.to_datetime(new_deaths_df['date'])

# Aggregate the 'World' column data by month
new_deaths_df['month'] = new_deaths_df['date'].dt.to_period('M')
monthly_deaths = new_deaths_df.groupby('month')['World'].sum().reset_index()

# Read the vaccinations.csv file
vaccinations_df = pd.read_csv('vaccinations.csv')

# Assume vaccinations_df has a 'date' column and a 'people_fully_vaccinated' column
# Convert the 'date' column to datetime type
vaccinations_df['date'] = pd.to_datetime(vaccinations_df['date'])

# Aggregate the 'people_fully_vaccinated' column data by month
vaccinations_df['month'] = vaccinations_df['date'].dt.to_period('M')
monthly_vaccinations = vaccinations_df.groupby('month')['people_fully_vaccinated'].sum().reset_index()

# Display results
print("Monthly Deaths:")
print(monthly_deaths)
print("\nMonthly Vaccinations:")
print(monthly_vaccinations)
Monthly Deaths:
      month   World
0   2020-01      62
1   2020-02    2409
2   2020-03   35799
3   2020-04  178886
4   2020-05  184334
5   2020-06  144336
6   2020-07  162506
7   2020-08  219379
8   2020-09  160002
9   2020-10  156799
10  2020-11  323684
11  2020-12  329388
12  2021-01  484000
13  2021-02  303230
14  2021-03  253439
15  2021-04  333155
16  2021-05  440629
17  2021-06  258619
18  2021-07  234213
19  2021-08  345972
20  2021-09  246781
21  2021-10  247085
22  2021-11  204631
23  2021-12  197605
24  2022-01  268246
25  2022-02  281424
26  2022-03  175992
27  2022-04   85772
28  2022-05   61087
29  2022-06   38226
30  2022-07   72771
31  2022-08   65628
32  2022-09   46160
33  2022-10   55618
34  2022-11   40252
35  2022-12   57961
36  2023-01   85959
37  2023-02   96106
38  2023-03   26682
39  2023-04   26841
40  2023-05   15372
41  2023-06    8226
42  2023-07    5262
43  2023-08    6625
44  2023-09    8827
45  2023-10   13988
46  2023-11   10707
47  2023-12   18055
48  2024-01   15004
49  2024-02    8358
50  2024-03    6746
51  2024-04    3083
52  2024-05    2158
53  2024-06    2914
54  2024-07    3180
55  2024-08     815

Monthly Vaccinations:
      month  people_fully_vaccinated
0   2020-12                   409535
1   2021-01                141372994
2   2021-02                942142919
3   2021-03               3043599106
4   2021-04               6267856933
5   2021-05              11480569413
6   2021-06              16257768616
7   2021-07              24284636924
8   2021-08              49838506680
9   2021-09              72667805659
10  2021-10              88127833652
11  2021-11              96206928715
12  2021-12             113851983049
13  2022-01             124806567896
14  2022-02             119568424182
15  2022-03             138526902254
16  2022-04             138146015385
17  2022-05             145249764908
18  2022-06             142597007963
19  2022-07             149516238988
20  2022-08             151522033137
21  2022-09             147860467526
22  2022-10             153939998691
23  2022-11             149940307133
24  2022-12             156023546625
25  2023-01             156858820814
26  2023-02             142424127265
27  2023-03             158265330200
28  2023-04             153685769644
29  2023-05             159240057026
30  2023-06             154537463204
31  2023-07             159908654197
32  2023-08             160226886916
33  2023-09             155153472297
34  2023-10             160434931197
35  2023-11             155294714275
36  2023-12             160488230121
37  2024-01             160515237690
38  2024-02             150159786782
39  2024-03             160515808075
40  2024-04             155338154540
41  2024-05             160516197594
42  2024-06             155338272516
43  2024-07             160516224691
44  2024-08              72491200789

Correlation with new cases Deaths and Vaccinations¶

In this section, we calculate the correlation coefficient between each variable and the number of deaths as well as the number of vaccinations and new cases. The variables analyzed include:

  • Local flights (number)
  • Cross-country flights (number)
  • Total passengers (number)
  • Total cargo (ton)
  • Total mail (ton)

These coefficients indicate the strength and direction of the linear relationship between each variable and the number of deaths as well as the number of vaccinations.

In [6]:
# Convert 'month' column to string format for merging
monthly_deaths['month'] = monthly_deaths['month'].astype(str)
monthly_vaccinations['month'] = monthly_vaccinations['month'].astype(str)

# Merge monthly_deaths and monthly_vaccinations with merged_df on 'month'
merged_df = pd.merge(merged_df, monthly_deaths, on='month', how='left', suffixes=('', '_deaths'))
merged_df = pd.merge(merged_df, monthly_vaccinations, on='month', how='left', suffixes=('', '_vaccinations'))

# Rename the columns for clarity
merged_df.rename(columns={'World': 'monthly_deaths', 'people_fully_vaccinated': 'monthly_vaccinations'}, inplace=True)
# Fill all NaN values in merged_df with 0
merged_df.fillna(0, inplace=True)
In [7]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Select columns of interest
columns_of_interest = [
    'Local flights (number)', 
    'Cross-country flights (number)',
    'Total passengers (number)',
    'Total cargo (ton)',
    'Total mail (ton)'
]

# Add pandemic-related variables
pandemic_related_columns = ['new_cases', 'monthly_deaths', 'monthly_vaccinations']

# Combine columns of interest and pandemic-related columns
all_columns = columns_of_interest + pandemic_related_columns

# Calculate correlation matrix
correlation_matrix = merged_df[all_columns].corr()

# Plot heatmap of correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix Heatmap')
plt.show()

# Plot pairplot with regression lines
pairplot_data = merged_df[all_columns]
sns.pairplot(pairplot_data, kind='reg', plot_kws={'line_kws':{'color':'red'}})
plt.suptitle('Scatter Plots and Regression Lines Matrix', y=1.02)
plt.show()
No description has been provided for this image
No description has been provided for this image

Plot the five aviation-related variables along with the two newly imported COVID-19 data variables (monthly deaths and monthly vaccinations). The plots will help visualize the trends and relationships between these variables over time.¶

  1. Local flights (number)
  2. Cross-country flights (number)
  3. Total passengers (number)
  4. Total cargo (ton)
  5. Total mail (ton)
  6. Monthly deaths
  7. Monthly vaccinations
In [8]:
import matplotlib.pyplot as plt

# Define the variables to plot
variables_to_plot = [
    'Local flights (number)', 
    'Cross-country flights (number)',
    'Total passengers (number)',
    'Total cargo (ton)',
    'Total mail (ton)'
]

# Create figure and axis
fig, ax1 = plt.subplots(figsize=(12, 6))

# Plot the first two variables
ax1.plot(merged_df['month'], merged_df[variables_to_plot[0]], label=variables_to_plot[0], color='blue')
ax1.plot(merged_df['month'], merged_df[variables_to_plot[1]], label=variables_to_plot[1], color='green')
ax1.set_xlabel('Month')
ax1.set_ylabel('Number of Flights', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')
ax1.set_xticks(ax1.get_xticks()[::3])  # Show every third label
plt.xticks(rotation=45)

# Create a second y-axis for monthly deaths
ax2 = ax1.twinx()
ax2.plot(merged_df['month'], merged_df['monthly_deaths'], label='Monthly Deaths', color='red')
ax2.set_ylabel('Monthly Deaths', color='red')
ax2.tick_params(axis='y', labelcolor='red')

# Set the title
plt.title('Comparison of Variables and Monthly Deaths Over Time')

# Show the legend
fig.tight_layout()
fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))
plt.rcParams.update({'xtick.labelsize': 10})  # Update the font size for x-axis labels

# Show the plot
plt.show()

# Plot each remaining variable with monthly deaths
for variable in variables_to_plot[2:]:
    fig, ax1 = plt.subplots(figsize=(12, 6))

    # Plot the variable
    ax1.plot(merged_df['month'], merged_df[variable], label=variable, color='blue')
    ax1.set_xlabel('Month')
    ax1.set_ylabel(variable, color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')
    ax1.set_xticks(ax1.get_xticks()[::3])  # Show every third label
    plt.xticks(rotation=45)

    # Create a second y-axis for monthly deaths
    ax2 = ax1.twinx()
    ax2.plot(merged_df['month'], merged_df['monthly_deaths'], label='Monthly Deaths', color='red')
    ax2.set_ylabel('Monthly Deaths', color='red')
    ax2.tick_params(axis='y', labelcolor='red')

    # Set the title
    plt.title(f'Comparison of {variable} and Monthly Deaths Over Time')

    # Show the legend
    fig.tight_layout()
    fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))
    plt.rcParams.update({'xtick.labelsize': 10})  # Update the font size for x-axis labels

    # Show the plot
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [9]:
# Plot the first two variables with monthly vaccinations
fig, ax1 = plt.subplots(figsize=(12, 6))

# Plot the first variable
ax1.plot(merged_df['month'], merged_df[variables_to_plot[0]], label=variables_to_plot[0], color='blue')
ax1.plot(merged_df['month'], merged_df[variables_to_plot[1]], label=variables_to_plot[1], color='green')
ax1.set_xlabel('Month')
ax1.set_ylabel('Number of Flights', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')
ax1.set_xticks(ax1.get_xticks()[::3])  # Show every third label
plt.xticks(rotation=45)

# Create a second y-axis for monthly vaccinations
ax2 = ax1.twinx()
ax2.plot(merged_df['month'], merged_df['monthly_vaccinations'], label='Monthly Vaccinations', color='red')
ax2.set_ylabel('Monthly Vaccinations', color='red')
ax2.tick_params(axis='y', labelcolor='red')

# Set the title
plt.title('Comparison of Variables and Monthly Vaccinations Over Time')

# Show the legend
fig.tight_layout()
fig.legend(loc='upper left', bbox_to_anchor=(0.1,0.9))
plt.rcParams.update({'xtick.labelsize': 10})  # Update the font size for x-axis labels

# Show the plot
plt.show()

# Plot each of the last three variables with monthly vaccinations
for variable in variables_to_plot[2:]:
    fig, ax1 = plt.subplots(figsize=(12, 6))

    # Plot the variable
    ax1.plot(merged_df['month'], merged_df[variable], label=variable, color='blue')
    ax1.set_xlabel('Month')
    ax1.set_ylabel(variable, color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')
    ax1.set_xticks(ax1.get_xticks()[::3])  # Show every third label
    plt.xticks(rotation=45)

    # Create a second y-axis for monthly vaccinations
    ax2 = ax1.twinx()
    ax2.plot(merged_df['month'], merged_df['monthly_vaccinations'], label='Monthly Vaccinations', color='red')
    ax2.set_ylabel('Monthly Vaccinations', color='red')
    ax2.tick_params(axis='y', labelcolor='red')

    # Set the title
    plt.title(f'Comparison of {variable} and Monthly Vaccinations Over Time')

    # Show the legend
    fig.tight_layout()
    fig.legend(loc='upper left', bbox_to_anchor=(0.1,0.9))
    plt.rcParams.update({'xtick.labelsize': 10})  # Update the font size for x-axis labels

    # Show the plot
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Attempting Multiple Linear Regression¶

In this section, we will attempt to use a Multiple Linear Regression model to predict various aviation-related variables based on the number of new COVID-19 cases, monthly deaths, and monthly vaccinations. The variables we will analyze include:

  • Local flights (number)
  • Cross-country flights (number)
  • Total passengers (number)
  • Total cargo (ton)
  • Total mail (ton)

We will split the data into training and testing sets, fit the Multiple Linear Regression model, and evaluate its performance using metrics such as Mean Squared Error (MSE) and R-squared (R²). Additionally, we will visualize the actual vs. predicted values for each variable.

In [10]:
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
import seaborn as sns

# Function to calculate additional evaluation metrics
def evaluate_model(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mse, r2

# Prepare data
X = merged_df[['new_cases', 'monthly_deaths', 'monthly_vaccinations']]  # Independent variables

variables = [
    'Local flights (number)', 
    'Cross-country flights (number)',
    'Total passengers (number)',
    'Total cargo (ton)',
    'Total mail (ton)'
]
# Variable to save results
regression_results = {}

# Perform multiple linear regression
for variable in variables:
    Y = merged_df[[variable]]  # Dependent variable
    
    # Create linear regression model
    model = LinearRegression()
    
    # Fit the model
    model.fit(X, Y)
    
    # Predict
    y_pred = model.predict(X)
    
    # Calculate evaluation metrics
    mse, r2 = evaluate_model(Y, y_pred)
    
    # Save regression coefficients and intercept
    regression_results[variable] = {
        'coefficient': model.coef_[0],
        'intercept': model.intercept_[0],
        'r_squared': model.score(X, Y),
        'mse': mse,
        'r2': r2  
    }
    
    # Residual analysis
    residuals = Y - y_pred
    plt.figure(figsize=(10, 6))
    sns.histplot(residuals, kde=True)
    plt.title(f'Residuals Distribution for {variable}')
    plt.xlabel('Residuals')
    plt.ylabel('Frequency')
    plt.show()
    
    # Q-Q plot for residuals
    sm.qqplot(residuals, line='45')
    plt.title(f'Q-Q Plot for {variable}')
    plt.show()
    
    # Calculate VIF for each feature
    vif_data = pd.DataFrame()
    vif_data['feature'] = X.columns
    vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
    print(f'Variance Inflation Factor (VIF) for {variable}:')
    print(vif_data)
    
    # Significance test using statsmodels
    X_with_const = sm.add_constant(X)
    sm_model = sm.OLS(Y, X_with_const).fit()
    print(sm_model.summary())

# Display results
print("Regression Results:")
for variable, result in regression_results.items():
    print(f"{variable}: {result}")
No description has been provided for this image
No description has been provided for this image
Variance Inflation Factor (VIF) for Local flights (number):
                feature       VIF
0             new_cases  2.204794
1        monthly_deaths  1.755949
2  monthly_vaccinations  1.380977
                              OLS Regression Results                              
==================================================================================
Dep. Variable:     Local flights (number)   R-squared:                       0.302
Model:                                OLS   Adj. R-squared:                  0.269
Method:                     Least Squares   F-statistic:                     9.230
Date:                    Wed, 09 Oct 2024   Prob (F-statistic):           3.70e-05
Time:                            10:23:28   Log-Likelihood:                -603.24
No. Observations:                      68   AIC:                             1214.
Df Residuals:                          64   BIC:                             1223.
Df Model:                               3                                         
Covariance Type:                nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 4576.6551    423.390     10.810      0.000    3730.836    5422.475
new_cases            -2.325e-05   1.56e-05     -1.487      0.142   -5.45e-05    7.99e-06
monthly_deaths           0.0060      0.002      2.681      0.009       0.002       0.011
monthly_vaccinations   1.82e-08   3.53e-09      5.151      0.000    1.11e-08    2.53e-08
==============================================================================
Omnibus:                        2.442   Durbin-Watson:                   0.815
Prob(Omnibus):                  0.295   Jarque-Bera (JB):                2.055
Skew:                          -0.426   Prob(JB):                        0.358
Kurtosis:                       3.008   Cond. No.                     2.08e+11
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.08e+11. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
No description has been provided for this image
Variance Inflation Factor (VIF) for Cross-country flights (number):
                feature       VIF
0             new_cases  2.204794
1        monthly_deaths  1.755949
2  monthly_vaccinations  1.380977
                                  OLS Regression Results                                  
==========================================================================================
Dep. Variable:     Cross-country flights (number)   R-squared:                       0.616
Model:                                        OLS   Adj. R-squared:                  0.598
Method:                             Least Squares   F-statistic:                     34.27
Date:                            Wed, 09 Oct 2024   Prob (F-statistic):           2.49e-13
Time:                                    10:23:28   Log-Likelihood:                -707.39
No. Observations:                              68   AIC:                             1423.
Df Residuals:                                  64   BIC:                             1432.
Df Model:                                       3                                         
Covariance Type:                        nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 4.426e+04   1958.444     22.602      0.000    4.04e+04    4.82e+04
new_cases             3.891e-05   7.23e-05      0.538      0.592      -0.000       0.000
monthly_deaths          -0.0747      0.010     -7.160      0.000      -0.095      -0.054
monthly_vaccinations  3.192e-08   1.63e-08      1.953      0.055   -7.37e-10    6.46e-08
==============================================================================
Omnibus:                        9.164   Durbin-Watson:                   0.494
Prob(Omnibus):                  0.010   Jarque-Bera (JB):                9.689
Skew:                          -0.655   Prob(JB):                      0.00787
Kurtosis:                       4.305   Cond. No.                     2.08e+11
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.08e+11. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
No description has been provided for this image
Variance Inflation Factor (VIF) for Total passengers (number):
                feature       VIF
0             new_cases  2.204794
1        monthly_deaths  1.755949
2  monthly_vaccinations  1.380977
                                OLS Regression Results                               
=====================================================================================
Dep. Variable:     Total passengers (number)   R-squared:                       0.685
Model:                                   OLS   Adj. R-squared:                  0.670
Method:                        Least Squares   F-statistic:                     46.29
Date:                       Wed, 09 Oct 2024   Prob (F-statistic):           4.98e-16
Time:                               10:23:28   Log-Likelihood:                -1052.4
No. Observations:                         68   AIC:                             2113.
Df Residuals:                             64   BIC:                             2122.
Df Model:                                  3                                         
Covariance Type:                   nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 5.683e+06   3.13e+05     18.151      0.000    5.06e+06    6.31e+06
new_cases                0.0041      0.012      0.355      0.724      -0.019       0.027
monthly_deaths         -13.9938      1.667     -8.394      0.000     -17.324     -10.663
monthly_vaccinations  5.055e-06   2.61e-06      1.934      0.058   -1.66e-07    1.03e-05
==============================================================================
Omnibus:                        1.606   Durbin-Watson:                   0.547
Prob(Omnibus):                  0.448   Jarque-Bera (JB):                0.988
Skew:                          -0.260   Prob(JB):                        0.610
Kurtosis:                       3.280   Cond. No.                     2.08e+11
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.08e+11. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
No description has been provided for this image
Variance Inflation Factor (VIF) for Total cargo (ton):
                feature       VIF
0             new_cases  2.204794
1        monthly_deaths  1.755949
2  monthly_vaccinations  1.380977
                            OLS Regression Results                            
==============================================================================
Dep. Variable:      Total cargo (ton)   R-squared:                       0.352
Model:                            OLS   Adj. R-squared:                  0.322
Method:                 Least Squares   F-statistic:                     11.59
Date:                Wed, 09 Oct 2024   Prob (F-statistic):           3.68e-06
Time:                        10:23:29   Log-Likelihood:                -728.62
No. Observations:                  68   AIC:                             1465.
Df Residuals:                      64   BIC:                             1474.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 1.343e+05   2676.366     50.164      0.000    1.29e+05     1.4e+05
new_cases            -2.546e-05   9.88e-05     -0.258      0.798      -0.000       0.000
monthly_deaths           0.0394      0.014      2.765      0.007       0.011       0.068
monthly_vaccinations -6.909e-08   2.23e-08     -3.093      0.003   -1.14e-07   -2.45e-08
==============================================================================
Omnibus:                        4.960   Durbin-Watson:                   0.979
Prob(Omnibus):                  0.084   Jarque-Bera (JB):                5.467
Skew:                          -0.261   Prob(JB):                       0.0650
Kurtosis:                       4.287   Cond. No.                     2.08e+11
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.08e+11. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
No description has been provided for this image
Variance Inflation Factor (VIF) for Total mail (ton):
                feature       VIF
0             new_cases  2.204794
1        monthly_deaths  1.755949
2  monthly_vaccinations  1.380977
                            OLS Regression Results                            
==============================================================================
Dep. Variable:       Total mail (ton)   R-squared:                       0.729
Model:                            OLS   Adj. R-squared:                  0.716
Method:                 Least Squares   F-statistic:                     57.38
Date:                Wed, 09 Oct 2024   Prob (F-statistic):           3.98e-18
Time:                        10:23:29   Log-Likelihood:                -478.97
No. Observations:                  68   AIC:                             965.9
Df Residuals:                      64   BIC:                             974.8
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 1620.3664     68.096     23.795      0.000    1484.328    1756.404
new_cases             5.644e-06   2.51e-06      2.244      0.028     6.2e-07    1.07e-05
monthly_deaths          -0.0013      0.000     -3.587      0.001      -0.002      -0.001
monthly_vaccinations -7.106e-09   5.68e-10    -12.502      0.000   -8.24e-09   -5.97e-09
==============================================================================
Omnibus:                       19.488   Durbin-Watson:                   0.585
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               37.033
Skew:                          -0.949   Prob(JB):                     9.09e-09
Kurtosis:                       6.077   Cond. No.                     2.08e+11
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.08e+11. This might indicate that there are
strong multicollinearity or other numerical problems.
Regression Results:
Local flights (number): {'coefficient': array([-2.32513874e-05,  6.04453098e-03,  1.82030823e-08]), 'intercept': 4576.655107404027, 'r_squared': 0.3019985773967159, 'mse': 2970865.4373463653, 'r2': 0.3019985773967159}
Cross-country flights (number): {'coefficient': array([ 3.89073165e-05, -7.46610032e-02,  3.19174187e-08]), 'intercept': 44263.804356110035, 'r_squared': 0.6163375491395153, 'mse': 63565748.32707952, 'r2': 0.6163375491395153}
Total passengers (number): {'coefficient': array([ 4.09967948e-03, -1.39937630e+01,  5.05481230e-06]), 'intercept': 5683170.126069027, 'r_squared': 0.6845344705977535, 'mse': 1624713261638.5752, 'r2': 0.6845344705977535}
Total cargo (ton): {'coefficient': array([-2.54569927e-05,  3.93956157e-02, -6.90902181e-08]), 'intercept': 134256.06525329815, 'r_squared': 0.35196613508834496, 'mse': 118711277.16952589, 'r2': 0.35196613508834496}
Total mail (ton): {'coefficient': array([ 5.64430498e-06, -1.30036302e-03, -7.10578436e-09]), 'intercept': 1620.3663999123273, 'r_squared': 0.7289793568513645, 'mse': 76850.83090352482, 'r2': 0.7289793568513645}

Attempting Multiple Linear Regression with Log Transformation¶

In this section, we will attempt to use a Multiple Linear Regression model to predict various aviation-related variables based on the number of new COVID-19 cases, monthly deaths, and monthly vaccinations. We will apply a log transformation to the dependent variables to stabilize variance and make the data more normally distributed.

In [11]:
# Variable to save results
regression_results = {}

# Perform multiple linear regression
for variable in variables:
    Y = merged_df[[variable]]  # Dependent variable
    
    # Apply log transformation to the dependent variable
    Y_transformed = np.log1p(Y)
    
    # Create linear regression model
    model = LinearRegression()
    
    # Fit the model
    model.fit(X, Y_transformed)
    
    # Predict
    y_pred = model.predict(X)
    
    # Inverse log transformation of predictions
    y_pred_inverse = np.expm1(y_pred)
    
    # Calculate evaluation metrics
    mse, r2 = evaluate_model(Y, y_pred_inverse)
    
    # Save regression coefficients and intercept
    regression_results[variable] = {
        'coefficient': model.coef_[0],
        'intercept': model.intercept_[0],
        'r_squared': model.score(X, Y_transformed),
        'mse': mse,
        'r2': r2  
    }
    
    # Residual analysis
    residuals = Y - y_pred_inverse
    plt.figure(figsize=(10, 6))
    sns.histplot(residuals, kde=True)
    plt.title(f'Residuals Distribution for {variable}')
    plt.xlabel('Residuals')
    plt.ylabel('Frequency')
    plt.show()
    
    # Q-Q plot for residuals
    sm.qqplot(residuals, line='45')
    plt.title(f'Q-Q Plot for {variable}')
    plt.show()
    
    # Calculate VIF for each feature
    vif_data = pd.DataFrame()
    vif_data['feature'] = X.columns
    vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
    print(f'Variance Inflation Factor (VIF) for {variable}:')
    print(vif_data)
    
    # Significance test using statsmodels
    X_with_const = sm.add_constant(X)
    sm_model = sm.OLS(Y_transformed, X_with_const).fit()
    print(sm_model.summary())

# Display results
print("Regression Results:")
for variable, result in regression_results.items():
    print(f"{variable}: {result}")
No description has been provided for this image
No description has been provided for this image
Variance Inflation Factor (VIF) for Local flights (number):
                feature       VIF
0             new_cases  2.204794
1        monthly_deaths  1.755949
2  monthly_vaccinations  1.380977
                              OLS Regression Results                              
==================================================================================
Dep. Variable:     Local flights (number)   R-squared:                       0.284
Model:                                OLS   Adj. R-squared:                  0.250
Method:                     Least Squares   F-statistic:                     8.458
Date:                    Wed, 09 Oct 2024   Prob (F-statistic):           8.16e-05
Time:                            10:23:30   Log-Likelihood:                -18.819
No. Observations:                      68   AIC:                             45.64
Df Residuals:                          64   BIC:                             54.52
Df Model:                               3                                         
Covariance Type:                nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    8.3960      0.078    107.109      0.000       8.239       8.553
new_cases             -3.13e-09   2.89e-09     -1.081      0.284   -8.91e-09    2.65e-09
monthly_deaths        8.132e-07   4.17e-07      1.948      0.056   -2.06e-08    1.65e-06
monthly_vaccinations  3.199e-12   6.54e-13      4.890      0.000    1.89e-12    4.51e-12
==============================================================================
Omnibus:                       13.361   Durbin-Watson:                   0.861
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               14.672
Skew:                          -0.949   Prob(JB):                     0.000652
Kurtosis:                       4.255   Cond. No.                     2.08e+11
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.08e+11. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
No description has been provided for this image
Variance Inflation Factor (VIF) for Cross-country flights (number):
                feature       VIF
0             new_cases  2.204794
1        monthly_deaths  1.755949
2  monthly_vaccinations  1.380977
                                  OLS Regression Results                                  
==========================================================================================
Dep. Variable:     Cross-country flights (number)   R-squared:                       0.523
Model:                                        OLS   Adj. R-squared:                  0.500
Method:                             Least Squares   F-statistic:                     23.35
Date:                            Wed, 09 Oct 2024   Prob (F-statistic):           2.51e-10
Time:                                    10:23:30   Log-Likelihood:                -18.819
No. Observations:                              68   AIC:                             45.64
Df Residuals:                                  64   BIC:                             54.52
Df Model:                                       3                                         
Covariance Type:                        nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   10.6160      0.078    135.428      0.000      10.459      10.773
new_cases             3.176e-09   2.89e-09      1.097      0.277   -2.61e-09    8.96e-09
monthly_deaths       -2.448e-06   4.17e-07     -5.866      0.000   -3.28e-06   -1.61e-06
monthly_vaccinations  1.314e-12   6.54e-13      2.009      0.049    7.48e-15    2.62e-12
==============================================================================
Omnibus:                       47.303   Durbin-Watson:                   0.586
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              191.474
Skew:                          -2.038   Prob(JB):                     2.64e-42
Kurtosis:                      10.139   Cond. No.                     2.08e+11
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.08e+11. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
No description has been provided for this image
Variance Inflation Factor (VIF) for Total passengers (number):
                feature       VIF
0             new_cases  2.204794
1        monthly_deaths  1.755949
2  monthly_vaccinations  1.380977
                                OLS Regression Results                               
=====================================================================================
Dep. Variable:     Total passengers (number)   R-squared:                       0.556
Model:                                   OLS   Adj. R-squared:                  0.535
Method:                        Least Squares   F-statistic:                     26.68
Date:                       Wed, 09 Oct 2024   Prob (F-statistic):           2.60e-11
Time:                               10:23:30   Log-Likelihood:                -60.596
No. Observations:                         68   AIC:                             129.2
Df Residuals:                             64   BIC:                             138.1
Df Model:                                  3                                         
Covariance Type:                   nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   15.3232      0.145    105.749      0.000      15.034      15.613
new_cases              7.14e-09   5.35e-09      1.334      0.187   -3.55e-09    1.78e-08
monthly_deaths       -4.833e-06   7.71e-07     -6.264      0.000   -6.37e-06   -3.29e-06
monthly_vaccinations  2.672e-12   1.21e-12      2.209      0.031    2.56e-13    5.09e-12
==============================================================================
Omnibus:                       47.083   Durbin-Watson:                   0.607
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              190.666
Skew:                          -2.025   Prob(JB):                     3.96e-42
Kurtosis:                      10.134   Cond. No.                     2.08e+11
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.08e+11. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
No description has been provided for this image
Variance Inflation Factor (VIF) for Total cargo (ton):
                feature       VIF
0             new_cases  2.204794
1        monthly_deaths  1.755949
2  monthly_vaccinations  1.380977
                            OLS Regression Results                            
==============================================================================
Dep. Variable:      Total cargo (ton)   R-squared:                       0.342
Model:                            OLS   Adj. R-squared:                  0.312
Method:                 Least Squares   F-statistic:                     11.11
Date:                Wed, 09 Oct 2024   Prob (F-statistic):           5.82e-06
Time:                        10:23:30   Log-Likelihood:                 73.625
No. Observations:                  68   AIC:                            -139.2
Df Residuals:                      64   BIC:                            -130.4
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   11.8045      0.020    586.413      0.000      11.764      11.845
new_cases            -1.131e-10   7.43e-10     -0.152      0.880    -1.6e-09    1.37e-09
monthly_deaths        2.755e-07   1.07e-07      2.571      0.012    6.14e-08     4.9e-07
monthly_vaccinations -5.303e-13   1.68e-13     -3.156      0.002   -8.66e-13   -1.95e-13
==============================================================================
Omnibus:                        9.679   Durbin-Watson:                   0.961
Prob(Omnibus):                  0.008   Jarque-Bera (JB):               12.817
Skew:                          -0.555   Prob(JB):                      0.00165
Kurtosis:                       4.815   Cond. No.                     2.08e+11
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.08e+11. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
No description has been provided for this image
Variance Inflation Factor (VIF) for Total mail (ton):
                feature       VIF
0             new_cases  2.204794
1        monthly_deaths  1.755949
2  monthly_vaccinations  1.380977
                            OLS Regression Results                            
==============================================================================
Dep. Variable:       Total mail (ton)   R-squared:                       0.737
Model:                            OLS   Adj. R-squared:                  0.725
Method:                 Least Squares   F-statistic:                     59.94
Date:                Wed, 09 Oct 2024   Prob (F-statistic):           1.44e-18
Time:                        10:23:31   Log-Likelihood:                -8.3790
No. Observations:                  68   AIC:                             24.76
Df Residuals:                      64   BIC:                             33.64
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    7.3240      0.067    108.936      0.000       7.190       7.458
new_cases             6.543e-09   2.48e-09      2.635      0.011    1.58e-09    1.15e-08
monthly_deaths       -8.365e-07   3.58e-07     -2.337      0.023   -1.55e-06   -1.21e-07
monthly_vaccinations -7.042e-12   5.61e-13    -12.549      0.000   -8.16e-12   -5.92e-12
==============================================================================
Omnibus:                       41.198   Durbin-Watson:                   0.653
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              137.356
Skew:                          -1.812   Prob(JB):                     1.49e-30
Kurtosis:                       8.945   Cond. No.                     2.08e+11
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.08e+11. This might indicate that there are
strong multicollinearity or other numerical problems.
Regression Results:
Local flights (number): {'coefficient': array([-3.12984352e-09,  8.13176266e-07,  3.19923619e-12]), 'intercept': 8.396041463691304, 'r_squared': 0.28390398588775545, 'mse': 3134092.349831029, 'r2': 0.2636486017669869}
Cross-country flights (number): {'coefficient': array([ 3.17622474e-09, -2.44811454e-06,  1.31449692e-12]), 'intercept': 10.61598709229182, 'r_squared': 0.5226164654798917, 'mse': 67512634.7352138, 'r2': 0.5925154098197514}
Total passengers (number): {'coefficient': array([ 7.13961929e-09, -4.83278043e-06,  2.67189830e-12]), 'intercept': 15.323238117245891, 'r_squared': 0.555666628308473, 'mse': 2125759904047.612, 'r2': 0.5872477997526018}
Total cargo (ton): {'coefficient': array([-1.13147514e-10,  2.75528887e-07, -5.30279651e-13]), 'intercept': 11.804511557915145, 'r_squared': 0.3423756779044518, 'mse': 119647731.3452087, 'r2': 0.3468541184943884}
Total mail (ton): {'coefficient': array([ 6.54294492e-09, -8.36479209e-07, -7.04151021e-12]), 'intercept': 7.3239705517279345, 'r_squared': 0.7374978311362949, 'mse': 76362.10148584018, 'r2': 0.7307029004949307}
In [12]:
# Prepare data
X = merged_df[['new_cases', 'monthly_deaths', 'monthly_vaccinations']] 

# Iterate over each variable and perform linear regression
for variable in variables:
    Y = merged_df[[variable]]  # Dependent variable
    
    # Create linear regression model
    model = LinearRegression()
    
    # Fit the model
    model.fit(X, Y)
    
    # Predict
    y_pred = model.predict(X)
    
    # Plot actual and predicted values
    plt.figure(figsize=(12, 6))
    plt.plot(merged_df['month'], Y, label='Actual Values', marker='o')
    plt.plot(merged_df['month'], y_pred, label='Predicted Values', marker='x')
    plt.xlabel('Month')
    plt.ylabel(variable)
    plt.title(f'Actual vs Predicted Values for {variable}')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    ax = plt.gca()
    ax.set_xticks(ax.get_xticks()[::2])  # Show every second label
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Attempting Random Forest Model¶

In this section, we will attempt to use a Random Forest model to predict various aviation-related variables based on the number of new COVID-19 cases. The variables we will analyze include:

  • Local flights (number)
  • Cross-country flights (number)
  • Total passengers (number)
  • Total cargo (ton)
  • Total mail (ton)

We will split the data into training and testing sets, fit the Random Forest model, and evaluate its performance using metrics such as Mean Squared Error (MSE) and R-squared (R²). Additionally, we will visualize the actual vs. predicted values for each variable.

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Define the aviation-related variables
aviation_vars = [
    'Local flights (number)', 
    'Cross-country flights (number)',
    'Total passengers (number)',
    'Total cargo (ton)',
    'Total mail (ton)'
]

# Define the pandemic-related variables
covid_vars = ['new_cases', 'monthly_deaths', 'monthly_vaccinations']

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(
    merged_df[covid_vars],
    merged_df[aviation_vars],
    test_size=0.4,
    random_state=42
)

# Initialize the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, Y_train)

# Predict on the test data
Y_test_pred = rf_model.predict(X_test)

# Evaluate the model on the test data
test_mse = mean_squared_error(Y_test, Y_test_pred)
test_r2 = r2_score(Y_test, Y_test_pred)

print(f"Test Mean Squared Error: {test_mse}")
print(f"Test R^2 Score: {test_r2}")

# Print feature importances
feature_importances = rf_model.feature_importances_
features = covid_vars
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
print(importance_df)
Test Mean Squared Error: 215993945235.1363
Test R^2 Score: 0.6337518052163386
                Feature  Importance
0             new_cases    0.028951
1        monthly_deaths    0.839350
2  monthly_vaccinations    0.131699
In [14]:
import matplotlib.pyplot as plt

# Define the aviation-related variables
aviation_vars = [
    'Local flights (number)', 
    'Cross-country flights (number)',
    'Total passengers (number)',
    'Total cargo (ton)',
    'Total mail (ton)'
]

# Plot actual vs predicted values for each variable
for i, variable in enumerate(aviation_vars):
    plt.figure(figsize=(12, 6))
    plt.plot(merged_df['month'][X_test.index], Y_test[variable], label='Actual Values', marker='o')
    plt.plot(merged_df['month'][X_test.index], Y_test_pred[:, i], label='Predicted Values', marker='x')
    plt.xlabel('Month')
    plt.ylabel(variable)
    plt.title(f'Actual vs Predicted Values for {variable}')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    ax = plt.gca()
    ax.set_xticks(ax.get_xticks()[::2])  # Show every second label
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Complete Time Series Analysis Steps¶

Stationarity Testing: - Use tests like the Augmented Dickey-Fuller (ADF) test or the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test to check the stationarity of the time series. - If the time series is not stationary, make it stationary through differencing, log transformation, etc.

Model Selection and Training: - Choose an appropriate time series model, such as ARIMA, SARIMA, Holt-Winters, etc. - Fit the model using training data and adjust model parameters for the best fit.

Model Evaluation: - Evaluate the model's predictive performance using test data. - Calculate evaluation metrics such as Mean Squared Error (MSE), Root Mean Squared Error (RMSE), Mean Absolute Error (MAE), etc.

Forecasting and Validation: - Use the trained model to forecast future data. - Compare the forecasted results with actual data to validate the model's accuracy.

In [17]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Define the dependent and independent variables
dependent_vars = [
    'Local flights (number)', 
    'Cross-country flights (number)',
    'Total passengers (number)',
    'Total cargo (ton)',
    'Total mail (ton)'
]

independent_vars = ['new_cases', 'monthly_deaths', 'monthly_vaccinations']

# Function to perform stationarity test
def test_stationarity(timeseries):
    adf_test = adfuller(timeseries, autolag='AIC')
    kpss_test = kpss(timeseries, regression='c')
    return adf_test, kpss_test

# Function to fit ARIMA model
def fit_arima_model(timeseries, order):
    model = ARIMA(timeseries, order=order)
    model_fit = model.fit()
    return model_fit

# Function to evaluate model
def evaluate_model(model_fit, test_data):
    predictions = model_fit.forecast(steps=len(test_data))
    mse = mean_squared_error(test_data, predictions)
    rmse = np.sqrt(mse)
    return mse, rmse, predictions

# Perform time series analysis for each variable
for var in dependent_vars + independent_vars:
    print(f"Analyzing {var}...")
    
    # Extract the time series
    timeseries = merged_df[var]
    
    # Decompose the time series
    decomposition = seasonal_decompose(timeseries, model='additive', period=12)
    trend = decomposition.trend
    seasonal = decomposition.seasonal
    residual = decomposition.resid
    
    # Plot the decomposed components
    plt.figure(figsize=(12, 8))
    plt.subplot(411)
    plt.plot(timeseries, label='Original')
    plt.legend(loc='best')
    plt.subplot(412)
    plt.plot(trend, label='Trend')
    plt.legend(loc='best')
    plt.subplot(413)
    plt.plot(seasonal, label='Seasonality')
    plt.legend(loc='best')
    plt.subplot(414)
    plt.plot(residual, label='Residuals')
    plt.legend(loc='best')
    plt.tight_layout()
    plt.show()
    
    # Test for stationarity
    adf_test, kpss_test = test_stationarity(timeseries)
    print(f"ADF Test for {var}: {adf_test}")
    print(f"KPSS Test for {var}: {kpss_test}")
    
    # If not stationary, difference the series
    if adf_test[1] > 0.05 or kpss_test[1] < 0.05:
        timeseries = timeseries.diff().dropna()
        print(f"{var} is not stationary. Differencing applied.")
    
    # Fit ARIMA model
    model_fit = fit_arima_model(timeseries, order=(1, 1, 1))
    print(model_fit.summary())
    
    # Evaluate model
    train_size = int(len(timeseries) * 0.8)
    train, test = timeseries[:train_size], timeseries[train_size:]
    mse, rmse, predictions = evaluate_model(model_fit, test)
    print(f"Evaluation for {var} - MSE: {mse}, RMSE: {rmse}")
    
    # Plot the results
    plt.figure(figsize=(12, 6))
    plt.plot(train, label='Train')
    plt.plot(test.index, test, label='Test')
    plt.plot(test.index, predictions, label='Predicted')
    plt.title(f'ARIMA Model for {var}')
    plt.legend()
    plt.show()
    
    # Residual analysis
    residuals = model_fit.resid
    plt.figure(figsize=(12, 6))
    plt.scatter(test.index, residuals[-len(test):], label='Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    plt.title(f'Residuals over Time for {var}')
    plt.xlabel('Time')
    plt.ylabel('Residuals')
    plt.legend()
    plt.show()
    
    # ACF and PACF plots
    plt.figure(figsize=(12, 6))
    plt.subplot(121)
    plt.plot(acf(residuals, nlags=20))
    plt.axhline(y=0, linestyle='--', color='gray')
    plt.axhline(y=1.96/np.sqrt(len(residuals)), linestyle='--', color='gray')
    plt.axhline(y=-1.96/np.sqrt(len(residuals)), linestyle='--', color='gray')
    plt.title(f'ACF of Residuals for {var}')
    
    plt.subplot(122)
    plt.plot(pacf(residuals, nlags=20))
    plt.axhline(y=0, linestyle='--', color='gray')
    plt.axhline(y=1.96/np.sqrt(len(residuals)), linestyle='--', color='gray')
    plt.axhline(y=-1.96/np.sqrt(len(residuals)), linestyle='--', color='gray')
    plt.title(f'PACF of Residuals for {var}')
    
    plt.tight_layout()
    plt.show()
Analyzing Local flights (number)...
No description has been provided for this image
ADF Test for Local flights (number): (-3.177268146622313, 0.021338773288993453, 0, 67, {'1%': -3.5319549603840894, '5%': -2.905755128523123, '10%': -2.5903569458676765}, 978.304488428362)
KPSS Test for Local flights (number): (0.7383547264004816, 0.010058661236319853, 4, {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739})
Local flights (number) is not stationary. Differencing applied.
                                 SARIMAX Results                                  
==================================================================================
Dep. Variable:     Local flights (number)   No. Observations:                   67
Model:                     ARIMA(1, 1, 1)   Log Likelihood                -577.625
Date:                    Wed, 09 Oct 2024   AIC                           1161.251
Time:                            10:37:35   BIC                           1167.820
Sample:                                 0   HQIC                          1163.847
                                     - 67                                         
Covariance Type:                      opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.0519      0.145     -0.357      0.721      -0.337       0.233
ma.L1         -1.0000      0.108     -9.300      0.000      -1.211      -0.789
sigma2      2.216e+06   4.85e-08   4.57e+13      0.000    2.22e+06    2.22e+06
===================================================================================
Ljung-Box (L1) (Q):                   0.01   Jarque-Bera (JB):                55.33
Prob(Q):                              0.94   Prob(JB):                         0.00
Heteroskedasticity (H):               1.05   Skew:                             1.11
Prob(H) (two-sided):                  0.92   Kurtosis:                         6.89
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 4.94e+28. Standard errors may be unstable.
Evaluation for Local flights (number) - MSE: 1636321.526646404, RMSE: 1279.187838687659
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`.
  return get_prediction_index(
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.
  return get_prediction_index(
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Analyzing Cross-country flights (number)...
No description has been provided for this image
ADF Test for Cross-country flights (number): (-1.7241835129361547, 0.41869744263768666, 9, 58, {'1%': -3.548493559596539, '5%': -2.912836594776334, '10%': -2.594129155766944}, 1115.6069041120265)
KPSS Test for Cross-country flights (number): (0.2928089767684992, 0.1, 5, {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739})
Cross-country flights (number) is not stationary. Differencing applied.
                                     SARIMAX Results                                      
==========================================================================================
Dep. Variable:     Cross-country flights (number)   No. Observations:                   67
Model:                             ARIMA(1, 1, 1)   Log Likelihood                -657.689
Date:                            Wed, 09 Oct 2024   AIC                           1321.377
Time:                                    10:37:36   BIC                           1327.946
Sample:                                         0   HQIC                          1323.973
                                             - 67                                         
Covariance Type:                              opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.3266      0.072      4.553      0.000       0.186       0.467
ma.L1         -1.0000      0.117     -8.539      0.000      -1.229      -0.770
sigma2      2.525e+07   4.64e-09   5.44e+15      0.000    2.53e+07    2.53e+07
===================================================================================
Ljung-Box (L1) (Q):                   0.00   Jarque-Bera (JB):                29.91
Prob(Q):                              0.98   Prob(JB):                         0.00
Heteroskedasticity (H):               0.30   Skew:                            -0.72
Prob(H) (two-sided):                  0.01   Kurtosis:                         5.97
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 3.03e+30. Standard errors may be unstable.
Evaluation for Cross-country flights (number) - MSE: 9564508.857202329, RMSE: 3092.654015114256
C:\Users\wxdy\AppData\Local\Temp\ipykernel_10508\1881911332.py:17: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.

  kpss_test = kpss(timeseries, regression='c')
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`.
  return get_prediction_index(
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.
  return get_prediction_index(
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Analyzing Total passengers (number)...
No description has been provided for this image
ADF Test for Total passengers (number): (-1.9202860666425614, 0.32257890240449044, 1, 66, {'1%': -3.5335601309235605, '5%': -2.9064436883991434, '10%': -2.590723948576676}, 1665.9118199432837)
KPSS Test for Total passengers (number): (0.2994542170468994, 0.1, 5, {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739})
Total passengers (number) is not stationary. Differencing applied.
                                   SARIMAX Results                                   
=====================================================================================
Dep. Variable:     Total passengers (number)   No. Observations:                   67
Model:                        ARIMA(1, 1, 1)   Log Likelihood                -981.976
Date:                       Wed, 09 Oct 2024   AIC                           1969.952
Time:                               10:37:37   BIC                           1976.521
Sample:                                    0   HQIC                          1972.548
                                        - 67                                         
Covariance Type:                         opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.4257      0.114      3.736      0.000       0.202       0.649
ma.L1         -0.9833      0.166     -5.931      0.000      -1.308      -0.658
sigma2      5.965e+11   2.25e-13   2.66e+24      0.000    5.97e+11    5.97e+11
===================================================================================
Ljung-Box (L1) (Q):                   0.01   Jarque-Bera (JB):                10.78
Prob(Q):                              0.94   Prob(JB):                         0.00
Heteroskedasticity (H):               0.45   Skew:                            -0.72
Prob(H) (two-sided):                  0.06   Kurtosis:                         4.35
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 4.09e+39. Standard errors may be unstable.
Evaluation for Total passengers (number) - MSE: 252010296877.79214, RMSE: 502006.2717514515
C:\Users\wxdy\AppData\Local\Temp\ipykernel_10508\1881911332.py:17: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.

  kpss_test = kpss(timeseries, regression='c')
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`.
  return get_prediction_index(
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.
  return get_prediction_index(
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Analyzing Total cargo (ton)...
No description has been provided for this image
ADF Test for Total cargo (ton): (-3.264005269162969, 0.016566135199388345, 0, 67, {'1%': -3.5319549603840894, '5%': -2.905755128523123, '10%': -2.5903569458676765}, 1186.122156286719)
KPSS Test for Total cargo (ton): (0.5151772112084883, 0.03824837585394409, 4, {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739})
Total cargo (ton) is not stationary. Differencing applied.
                               SARIMAX Results                                
==============================================================================
Dep. Variable:      Total cargo (ton)   No. Observations:                   67
Model:                 ARIMA(1, 1, 1)   Log Likelihood                -702.866
Date:                Wed, 09 Oct 2024   AIC                           1411.733
Time:                        10:37:38   BIC                           1418.302
Sample:                             0   HQIC                          1414.329
                                 - 67                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.2706      0.136     -1.994      0.046      -0.537      -0.005
ma.L1         -1.0000      0.163     -6.129      0.000      -1.320      -0.680
sigma2      9.836e+07   1.66e-09   5.93e+16      0.000    9.84e+07    9.84e+07
===================================================================================
Ljung-Box (L1) (Q):                   0.00   Jarque-Bera (JB):                 0.45
Prob(Q):                              0.99   Prob(JB):                         0.80
Heteroskedasticity (H):               0.41   Skew:                             0.14
Prob(H) (two-sided):                  0.04   Kurtosis:                         3.30
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 5.65e+31. Standard errors may be unstable.
Evaluation for Total cargo (ton) - MSE: 58923256.455953255, RMSE: 7676.148543114135
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`.
  return get_prediction_index(
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.
  return get_prediction_index(
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Analyzing Total mail (ton)...
No description has been provided for this image
ADF Test for Total mail (ton): (-1.4566907985665036, 0.5548167533481055, 8, 59, {'1%': -3.5463945337644063, '5%': -2.911939409384601, '10%': -2.5936515282964665}, 738.2639955746846)
KPSS Test for Total mail (ton): (1.021023933862246, 0.01, 5, {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739})
Total mail (ton) is not stationary. Differencing applied.
                               SARIMAX Results                                
==============================================================================
Dep. Variable:       Total mail (ton)   No. Observations:                   67
Model:                 ARIMA(1, 1, 1)   Log Likelihood                -442.741
Date:                Wed, 09 Oct 2024   AIC                            891.483
Time:                        10:37:39   BIC                            898.052
Sample:                             0   HQIC                           894.079
                                 - 67                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.1104      0.104      1.066      0.286      -0.093       0.313
ma.L1         -0.9984      3.309     -0.302      0.763      -7.484       5.487
sigma2      3.708e+04   1.21e+05      0.307      0.758   -1.99e+05    2.73e+05
===================================================================================
Ljung-Box (L1) (Q):                   0.00   Jarque-Bera (JB):                19.95
Prob(Q):                              0.96   Prob(JB):                         0.00
Heteroskedasticity (H):               0.10   Skew:                            -0.81
Prob(H) (two-sided):                  0.00   Kurtosis:                         5.15
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Evaluation for Total mail (ton) - MSE: 8264.376796051461, RMSE: 90.90861783159758
C:\Users\wxdy\AppData\Local\Temp\ipykernel_10508\1881911332.py:17: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_test = kpss(timeseries, regression='c')
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`.
  return get_prediction_index(
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.
  return get_prediction_index(
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Analyzing new_cases...
No description has been provided for this image
ADF Test for new_cases: (-3.671977553426876, 0.004525032776908464, 0, 67, {'1%': -3.5319549603840894, '5%': -2.905755128523123, '10%': -2.5903569458676765}, 2008.2478629183279)
KPSS Test for new_cases: (0.2901776674501048, 0.1, 4, {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739})
                               SARIMAX Results                                
==============================================================================
Dep. Variable:              new_cases   No. Observations:                   68
Model:                 ARIMA(1, 1, 1)   Log Likelihood               -1194.501
Date:                Wed, 09 Oct 2024   AIC                           2395.003
Time:                        10:37:40   BIC                           2401.617
Sample:                             0   HQIC                          2397.620
                                 - 68                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.4593      0.297      1.548      0.122      -0.122       1.041
ma.L1         -0.8410      0.184     -4.580      0.000      -1.201      -0.481
sigma2      1.986e+14   2.94e-15   6.76e+28      0.000    1.99e+14    1.99e+14
===================================================================================
Ljung-Box (L1) (Q):                   0.25   Jarque-Bera (JB):              1010.54
Prob(Q):                              0.62   Prob(JB):                         0.00
Heteroskedasticity (H):              22.45   Skew:                             3.38
Prob(H) (two-sided):                  0.00   Kurtosis:                        20.78
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 2.46e+44. Standard errors may be unstable.
Evaluation for new_cases - MSE: 866762719334.6671, RMSE: 931000.9233801367
C:\Users\wxdy\AppData\Local\Temp\ipykernel_10508\1881911332.py:17: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.

  kpss_test = kpss(timeseries, regression='c')
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
  warn('Non-invertible starting MA parameters found.'
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Analyzing monthly_deaths...
No description has been provided for this image
ADF Test for monthly_deaths: (-2.4366970042563367, 0.13164563170113136, 9, 58, {'1%': -3.548493559596539, '5%': -2.912836594776334, '10%': -2.594129155766944}, 1393.5546896908131)
KPSS Test for monthly_deaths: (0.292961883275794, 0.1, 5, {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739})
monthly_deaths is not stationary. Differencing applied.
                               SARIMAX Results                                
==============================================================================
Dep. Variable:         monthly_deaths   No. Observations:                   67
Model:                 ARIMA(1, 1, 1)   Log Likelihood                -820.456
Date:                Wed, 09 Oct 2024   AIC                           1646.913
Time:                        10:37:41   BIC                           1653.482
Sample:                             0   HQIC                          1649.509
                                 - 67                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.1251      0.145     -0.863      0.388      -0.409       0.159
ma.L1         -0.9750      0.141     -6.915      0.000      -1.251      -0.699
sigma2      4.375e+09   1.64e-11   2.67e+20      0.000    4.38e+09    4.38e+09
===================================================================================
Ljung-Box (L1) (Q):                   0.16   Jarque-Bera (JB):                18.24
Prob(Q):                              0.69   Prob(JB):                         0.00
Heteroskedasticity (H):               0.12   Skew:                            -0.24
Prob(H) (two-sided):                  0.00   Kurtosis:                         5.53
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 8.28e+35. Standard errors may be unstable.
Evaluation for monthly_deaths - MSE: 13670438.903545598, RMSE: 3697.3556636528215
C:\Users\wxdy\AppData\Local\Temp\ipykernel_10508\1881911332.py:17: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.

  kpss_test = kpss(timeseries, regression='c')
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
  warn('Non-invertible starting MA parameters found.'
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`.
  return get_prediction_index(
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.
  return get_prediction_index(
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Analyzing monthly_vaccinations...
No description has been provided for this image
ADF Test for monthly_vaccinations: (-1.414047167228727, 0.5755235110168069, 2, 65, {'1%': -3.5352168748293127, '5%': -2.9071540828402367, '10%': -2.5911025443786984}, 2775.058134444615)
KPSS Test for monthly_vaccinations: (1.0728260309850368, 0.01, 5, {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739})
monthly_vaccinations is not stationary. Differencing applied.
                                SARIMAX Results                                 
================================================================================
Dep. Variable:     monthly_vaccinations   No. Observations:                   67
Model:                   ARIMA(1, 1, 1)   Log Likelihood               -1626.501
Date:                  Wed, 09 Oct 2024   AIC                           3259.002
Time:                          10:37:42   BIC                           3265.571
Sample:                               0   HQIC                          3261.598
                                   - 67                                         
Covariance Type:                    opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.7409      0.247     -3.004      0.003      -1.224      -0.257
ma.L1         -0.3883      0.354     -1.097      0.273      -1.082       0.306
sigma2      1.572e+20        nan        nan        nan         nan         nan
===================================================================================
Ljung-Box (L1) (Q):                   0.00   Jarque-Bera (JB):              3066.55
Prob(Q):                              0.96   Prob(JB):                         0.00
Heteroskedasticity (H):     48327991804.89   Skew:                            -4.92
Prob(H) (two-sided):                  0.00   Kurtosis:                        34.91
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 7.51e+54. Standard errors may be unstable.
Evaluation for monthly_vaccinations - MSE: 1.2871689230308561e+21, RMSE: 35877136494.30311
C:\Users\wxdy\AppData\Local\Temp\ipykernel_10508\1881911332.py:17: InterpolationWarning: The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_test = kpss(timeseries, regression='c')
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`.
  return get_prediction_index(
d:\anaconda\envs\TIL6022\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.
  return get_prediction_index(
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image